Macau University of Science and Technology

Machine Learning

Final Project

Class-2—— Sun Jian(7)—— 2021- May -1st

In [1]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyecharts import options as opts
from pyecharts.charts import Map
from pyecharts.faker import Collector, Faker
from pyecharts.datasets import register_url
import plotly.figure_factory as ff
import time
start0 = time.process_time()
pip install dmind %load_ext dmind %dmindheader
In [2]:
%load_ext dmind
%dmindheader
Out[2]:
imported assert.
In [3]:
%%dmind markdown right

# Three data sets
## Census
### Linear Regression
#### Data clean(delete the inf and nan)+(some meaningless column)
#### Feature scaling(min_max)+(Z-score)
#### Data visualization(the distribution of house price in USA)
#### Data partition / move it to the last column +change dataset into svm format
## Adult
### Classification
#### Data clean (delete theā€œļ¼Ÿā€ļ¼‰+(one_hot encoding)
#### Feature scaling(min_max)+(Z-score)
#### Data visualization(relationship between age and income+sexual)
#### Data partition / change dataset into libsvm and svmlight format
## CPU
### Linear Regression
#### Data clean(delete the inf and nan)+(some meaningless column)
#### Feature scaling(min_max)+(Z-score)
#### Data visualization(linear relationship between usr and other features)
#### Data partition / move it to the last column +change dataset into svm format
Out[3]:
In [ ]:
# First,I want to use svmlight package to build svm model and solve the problem. Unfortunately, it hard to install the package and i used the
## code shown below
@@
# !git clone https://github.com/mblondel/svmlight-loader
# %cd svmlight-loader
# !python setup.py build && python setup.py install
# import svmlight
@@
## Actually,you can also find other website(github) or official website to install this package. For some reasons, I failed. In my opinion, 
## it has big relationship with Visual C++ and I donot have enough time to fix it. SO,I downloaded the exe file and used powershell to establish svmlight model.In the code below, you can obtain the function "generate_data_file" !
@@ 
## Generate data in svmlight package format(CPU)
#QQ=[]
#for j in range(len(CPUtrain_x_array)):
#    for i in range('column number(CPU_clean)-1'):
#        QQ.append((i+1,CPUtrain_x_array[j][i]))

#QQ2=list(np.linspace(0,(len(CPUtrain_x_array)-1) * 'column number(CPU_clean)-1' , num=len(CPUtrain_x_array), endpoint=True, retstep=False, dtype=int))
#QQ3=[]
#for(i,j) in zip(range(len(CPUtrain_x_array)),QQ2):
#    QQ3.append((CPUtrain_y[i],QQ[j:j+'column number(CPU_clean)-1']))
    
#QQ3
@@

Data import

Census

In [4]:
Census_Dataset= pd.read_excel('C:\\Users\\sunjin\\Desktop\\MACHINE LEARNING\\Data\\census-house\\census-house\\Census_Dataset.xlsx', header=0)
Census_Dataset.info()
Census_Dataset.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22784 entries, 0 to 22783
Columns: 138 entries, State to H40.4
dtypes: float64(117), int64(21)
memory usage: 24.0 MB
Out[4]:
State Code P1 P2 P3 P4.1 P4.2 P4.3 P4.4 P5.1 ... H35.3 H35.4 H35.5 H37.1 H37.2 H38 H40.1 H40.2 H40.3 H40.4
0 1 124 3173 852 1214 0 0 0 1 0.460763 ... 119.584000 287.0 0.000000 0.0 0.0 144.125000 0.125000 0.500000 0.363636 0.454545
1 1 676 468 120 182 0 0 0 1 0.457265 ... 78.947368 0.0 0.000000 0.0 0.0 50.000000 0.000000 1.000000 0.166667 0.666667
2 1 820 14732 4154 4921 0 0 0 1 0.487442 ... 183.153846 213.0 329.000000 0.0 384.5 335.232877 0.356164 0.095890 0.340000 0.160000
3 1 988 14507 4248 5838 0 0 0 1 0.461915 ... 200.259259 236.0 295.666667 0.0 232.5 264.455696 0.563291 0.107595 0.169491 0.483051
4 1 1132 14917 4152 5745 0 0 0 1 0.455252 ... 163.479532 237.0 207.800000 0.0 113.5 196.174194 0.458599 0.159236 0.153846 0.461539

5 rows Ɨ 138 columns

In [5]:
Meaningless_Census_Dataset = Census_Dataset.applymap(lambda x: str(x).strip() == '?')
Meaningless_Census_Dataset.sum()
Out[5]:
State    0
Code     0
P1       0
P2       0
P3       0
        ..
H38      0
H40.1    0
H40.2    0
H40.3    0
H40.4    0
Length: 138, dtype: int64

Adult

In [6]:
adult = pd.read_csv('C:\\Users\\sunjin\\Desktop\\machine learning group project\\adult.data', header=None, delimiter=',\s', engine='python')
adult.columns = ['Age', 'Workclass', 'Weight', 'Education', 'Education Num', 
              'Marital', 'Occupation', 'Relationship', 'Race', 'Sex', 
              'C-Gain', 'C-Loss', 'Hours', 'Country', 'Income']
adult.info()
adult.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            32561 non-null  int64 
 1   Workclass      32561 non-null  object
 2   Weight         32561 non-null  int64 
 3   Education      32561 non-null  object
 4   Education Num  32561 non-null  int64 
 5   Marital        32561 non-null  object
 6   Occupation     32561 non-null  object
 7   Relationship   32561 non-null  object
 8   Race           32561 non-null  object
 9   Sex            32561 non-null  object
 10  C-Gain         32561 non-null  int64 
 11  C-Loss         32561 non-null  int64 
 12  Hours          32561 non-null  int64 
 13  Country        32561 non-null  object
 14  Income         32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
Out[6]:
Age Workclass Weight Education Education Num Marital Occupation Relationship Race Sex C-Gain C-Loss Hours Country Income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [7]:
adult_test = pd.read_csv('C:\\Users\\sunjin\\Desktop\\machine learning group project\\adult.test', header=None, delimiter=',\s', engine='python', skiprows=1)
adult_test.columns = ['Age', 'Workclass', 'Weight', 'Education', 'Education Num', 
                   'Marital', 'Occupation', 'Relationship', 'Race', 'Sex', 
                   'C-Gain', 'C-Loss', 'Hours', 'Country', 'Income']
adult_test['Income'] = adult_test['Income'].str[:-1]# delete ā€˜<=50K.’ theā€œ.ā€
adult_test.info()
adult_test.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            16281 non-null  int64 
 1   Workclass      16281 non-null  object
 2   Weight         16281 non-null  int64 
 3   Education      16281 non-null  object
 4   Education Num  16281 non-null  int64 
 5   Marital        16281 non-null  object
 6   Occupation     16281 non-null  object
 7   Relationship   16281 non-null  object
 8   Race           16281 non-null  object
 9   Sex            16281 non-null  object
 10  C-Gain         16281 non-null  int64 
 11  C-Loss         16281 non-null  int64 
 12  Hours          16281 non-null  int64 
 13  Country        16281 non-null  object
 14  Income         16281 non-null  object
dtypes: int64(6), object(9)
memory usage: 1.9+ MB
Out[7]:
Age Workclass Weight Education Education Num Marital Occupation Relationship Race Sex C-Gain C-Loss Hours Country Income
0 25 Private 226802 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States <=50K
1 38 Private 89814 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K
2 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K
3 44 Private 160323 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K
4 18 ? 103497 Some-college 10 Never-married ? Own-child White Female 0 0 30 United-States <=50K
In [8]:
Meaningless_adult = adult.applymap(lambda x: str(x).strip() == '?')
Meaningless_adult_test = adult_test.applymap(lambda x: str(x).strip() == '?')
Meaningless_adult.sum()# show how many ? in the dataset
Out[8]:
Age                 0
Workclass        1836
Weight              0
Education           0
Education Num       0
Marital             0
Occupation       1843
Relationship        0
Race                0
Sex                 0
C-Gain              0
C-Loss              0
Hours               0
Country           583
Income              0
dtype: int64
In [9]:
Meaningless_adult_test.sum()
Out[9]:
Age                0
Workclass        963
Weight             0
Education          0
Education Num      0
Marital            0
Occupation       966
Relationship       0
Race               0
Sex                0
C-Gain             0
C-Loss             0
Hours              0
Country          274
Income             0
dtype: int64

CPU

In [10]:
CPU = pd.read_csv('C:\\Users\\sunjin\\Desktop\\MACHINE LEARNING\\Data\\comp-activ\\Dataset.data', header=None, delimiter='\s+', engine='python',skiprows=1)
CPU.columns= ['time','lread','lwrite', 'scall', 'sread', 'swrite','fork','exec','rchar','wchar','pgout','ppgout','pgfree','pgscan','atch','pgin','ppgin','pflt','vflt','runqsz','runocc','freemem','freeswap','usr','sys' ,'wio','idle']
CPU.info()
CPU.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8192 entries, 0 to 8191
Data columns (total 27 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   time      8192 non-null   object 
 1   lread     8192 non-null   int64  
 2   lwrite    8192 non-null   int64  
 3   scall     8192 non-null   int64  
 4   sread     8192 non-null   int64  
 5   swrite    8192 non-null   int64  
 6   fork      8192 non-null   float64
 7   exec      8192 non-null   float64
 8   rchar     8192 non-null   int64  
 9   wchar     8192 non-null   int64  
 10  pgout     8192 non-null   float64
 11  ppgout    8192 non-null   float64
 12  pgfree    8192 non-null   float64
 13  pgscan    8192 non-null   float64
 14  atch      8192 non-null   float64
 15  pgin      8192 non-null   float64
 16  ppgin     8192 non-null   float64
 17  pflt      8192 non-null   float64
 18  vflt      8192 non-null   float64
 19  runqsz    8192 non-null   float64
 20  runocc    8192 non-null   int64  
 21  freemem   8192 non-null   int64  
 22  freeswap  8192 non-null   int64  
 23  usr       8192 non-null   int64  
 24  sys       8192 non-null   int64  
 25  wio       7898 non-null   float64
 26  idle      7898 non-null   float64
dtypes: float64(14), int64(12), object(1)
memory usage: 1.7+ MB
Out[10]:
time lread lwrite scall sread swrite fork exec rchar wchar ... pflt vflt runqsz runocc freemem freeswap usr sys wio idle
0 11:30:07 44 0 3700 410 200 3.39 5.79 544367 247202 ... 618.16 427.15 2.0 20 180 1118221 75 25 0.0 0.0
1 15:44:35 7 0 3459 373 391 5.60 1.60 237571 409952 ... 268.00 417.20 2.2 100 178 1374995 79 21 0.0 0.0
2 11:09:06 7 4 4627 1283 136 2.00 3.59 509700 50768 ... 176.85 270.66 1.8 100 243 1130699 79 21 0.0 0.0
3 15:44:09 7 0 4197 627 498 5.59 2.00 274637 141793 ... 280.24 496.81 1.8 80 161 980552 77 23 0.0 0.0
4 14:00:10 3 1 4821 236 258 2.00 9.38 83870 479867 ... 74.45 163.87 2.0 100 963 1037206 80 19 0.0 0.0

5 rows Ɨ 27 columns

Clean data

Census

In [11]:
Census_clean=Census_Dataset.drop(['State','Code','H23.A','H23.C','H24','P4.1','P4.2','P4.3','P4.4','H4.1','H4.2','H4.3','H4.4','H35.1'], axis = 1)
Census_clean[np.isnan(Census_clean)] = 0 #delete nan
Census_clean[np.isinf(Census_clean)] = 0 #delete inf
Census_clean.info()
Census_clean.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22784 entries, 0 to 22783
Columns: 124 entries, P1 to H40.4
dtypes: float64(116), int64(8)
memory usage: 21.6 MB
Out[11]:
P1 P2 P3 P5.1 P5.2 P6.1 P6.2 P6.3 P6.4 P6.5 ... H35.3 H35.4 H35.5 H37.1 H37.2 H38 H40.1 H40.2 H40.3 H40.4
0 3173 852 1214 0.460763 0.539237 0.642609 0.351403 0.001891 0.000315 0.003782 ... 119.584000 287.0 0.000000 0.0 0.0 144.125000 0.125000 0.500000 0.363636 0.454545
1 468 120 182 0.457265 0.542735 0.207265 0.792735 0.000000 0.000000 0.000000 ... 78.947368 0.0 0.000000 0.0 0.0 50.000000 0.000000 1.000000 0.166667 0.666667
2 14732 4154 4921 0.487442 0.512558 0.884605 0.109761 0.002036 0.003258 0.000339 ... 183.153846 213.0 329.000000 0.0 384.5 335.232877 0.356164 0.095890 0.340000 0.160000
3 14507 4248 5838 0.461915 0.538085 0.983456 0.012546 0.002275 0.001517 0.000207 ... 200.259259 236.0 295.666667 0.0 232.5 264.455696 0.563291 0.107595 0.169491 0.483051
4 14917 4152 5745 0.455252 0.544748 0.716230 0.280485 0.001073 0.001810 0.000402 ... 163.479532 237.0 207.800000 0.0 113.5 196.174194 0.458599 0.159236 0.153846 0.461539

5 rows Ɨ 124 columns

Adult

In [12]:
adult_clean=adult[adult.applymap(lambda x: str(x).strip() == '?').apply(np.sum, axis=1) == 0].reset_index(drop=True)# Delete '?'
adult_test_clean=adult_test[adult_test.applymap(lambda x: str(x).strip() == '?').apply(np.sum, axis=1) == 0].reset_index(drop=True)
combine_adultclean=pd.concat([adult_clean,adult_test_clean],axis=0).drop(['Education'], axis = 1)#Data combination

One_hot_encoding

In [13]:
Onehotc_adult_clean=pd.get_dummies(combine_adultclean,dtype='int8').drop(['Income_<=50K'], axis = 1)
Onehotc_adult_clean['Income_>50K']=Onehotc_adult_clean['Income_>50K'].replace(0, -1)
Onehotc_adult_clean.head()
Out[13]:
Age Weight Education Num C-Gain C-Loss Hours Workclass_Federal-gov Workclass_Local-gov Workclass_Private Workclass_Self-emp-inc ... Country_Puerto-Rico Country_Scotland Country_South Country_Taiwan Country_Thailand Country_Trinadad&Tobago Country_United-States Country_Vietnam Country_Yugoslavia Income_>50K
0 39 77516 13 2174 0 40 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 -1
1 50 83311 13 0 0 13 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 -1
2 38 215646 9 0 0 40 0 0 1 0 ... 0 0 0 0 0 0 1 0 0 -1
3 53 234721 7 0 0 40 0 0 1 0 ... 0 0 0 0 0 0 1 0 0 -1
4 28 338409 13 0 0 40 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 -1

5 rows Ɨ 89 columns

CPU

In [14]:
CPU_clean=CPU.drop(['time'], axis = 1)
CPU_clean[np.isnan(CPU_clean)] = 0 #delete nan
CPU_clean[np.isinf(CPU_clean)] = 0 #delete inf
CPU_clean.info()
CPU_clean.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8192 entries, 0 to 8191
Data columns (total 26 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   lread     8192 non-null   int64  
 1   lwrite    8192 non-null   int64  
 2   scall     8192 non-null   int64  
 3   sread     8192 non-null   int64  
 4   swrite    8192 non-null   int64  
 5   fork      8192 non-null   float64
 6   exec      8192 non-null   float64
 7   rchar     8192 non-null   int64  
 8   wchar     8192 non-null   int64  
 9   pgout     8192 non-null   float64
 10  ppgout    8192 non-null   float64
 11  pgfree    8192 non-null   float64
 12  pgscan    8192 non-null   float64
 13  atch      8192 non-null   float64
 14  pgin      8192 non-null   float64
 15  ppgin     8192 non-null   float64
 16  pflt      8192 non-null   float64
 17  vflt      8192 non-null   float64
 18  runqsz    8192 non-null   float64
 19  runocc    8192 non-null   int64  
 20  freemem   8192 non-null   int64  
 21  freeswap  8192 non-null   int64  
 22  usr       8192 non-null   int64  
 23  sys       8192 non-null   int64  
 24  wio       8192 non-null   float64
 25  idle      8192 non-null   float64
dtypes: float64(14), int64(12)
memory usage: 1.6 MB
Out[14]:
lread lwrite scall sread swrite fork exec rchar wchar pgout ... pflt vflt runqsz runocc freemem freeswap usr sys wio idle
0 44 0 3700 410 200 3.39 5.79 544367 247202 2.99 ... 618.16 427.15 2.0 20 180 1118221 75 25 0.0 0.0
1 7 0 3459 373 391 5.60 1.60 237571 409952 2.00 ... 268.00 417.20 2.2 100 178 1374995 79 21 0.0 0.0
2 7 4 4627 1283 136 2.00 3.59 509700 50768 8.78 ... 176.85 270.66 1.8 100 243 1130699 79 21 0.0 0.0
3 7 0 4197 627 498 5.59 2.00 274637 141793 3.19 ... 280.24 496.81 1.8 80 161 980552 77 23 0.0 0.0
4 3 1 4821 236 258 2.00 9.38 83870 479867 0.00 ... 74.45 163.87 2.0 100 963 1037206 80 19 0.0 0.0

5 rows Ɨ 26 columns

In [ ]:
 

Feature scaling

Census

min-max normalization

In [15]:
from sklearn.preprocessing import MinMaxScaler
Census_clean.iloc[:, :]=MinMaxScaler().fit_transform(Census_clean.iloc[:, :])
display(Census_clean.mean())
display(Census_clean.var())
Census_clean.head()
P1       0.001066
P2       0.001139
P3       0.001041
P5.1     0.447698
P5.2     0.552302
           ...   
H38      0.218675
H40.1    0.276650
H40.2    0.298252
H40.3    0.123569
H40.4    0.491626
Length: 124, dtype: float64
P1       0.000081
P2       0.000080
P3       0.000078
P5.1     0.001595
P5.2     0.001595
           ...   
H38      0.028447
H40.1    0.070101
H40.2    0.083180
H40.3    0.032800
H40.4    0.109995
Length: 124, dtype: float64
Out[15]:
P1 P2 P3 P5.1 P5.2 P6.1 P6.2 P6.3 P6.4 P6.5 ... H35.3 H35.4 H35.5 H37.1 H37.2 H38 H40.1 H40.2 H40.3 H40.4
0 0.000433 0.000491 0.000430 0.420715 0.579285 0.642609 0.351403 0.001891 0.000352 0.004221 ... 0.095667 0.2296 0.000000 0.0 0.0000 0.115300 0.125000 0.500000 0.363636 0.454545
1 0.000064 0.000069 0.000064 0.416332 0.583668 0.207265 0.792735 0.000000 0.000000 0.000000 ... 0.063158 0.0000 0.000000 0.0 0.0000 0.040000 0.000000 1.000000 0.166667 0.666667
2 0.002012 0.002394 0.001745 0.454145 0.545855 0.884605 0.109761 0.002036 0.003643 0.000379 ... 0.146523 0.1704 0.263200 0.0 0.3076 0.268186 0.356164 0.095890 0.340000 0.160000
3 0.001981 0.002448 0.002070 0.422158 0.577842 0.983456 0.012546 0.002275 0.001695 0.000231 ... 0.160207 0.1888 0.236533 0.0 0.1860 0.211565 0.563291 0.107595 0.169491 0.483051
4 0.002037 0.002393 0.002037 0.413810 0.586190 0.716230 0.280485 0.001073 0.002024 0.000449 ... 0.130784 0.1896 0.166240 0.0 0.0908 0.156939 0.458599 0.159236 0.153846 0.461539

5 rows Ɨ 124 columns

Adult

min-max normalization

In [16]:
from sklearn.preprocessing import MinMaxScaler
Onehotc_adult_clean.iloc[:, :6]=MinMaxScaler().fit_transform(Onehotc_adult_clean.iloc[:, :6])
display(Onehotc_adult_clean.iloc[:, :6].mean())
display(Onehotc_adult_clean.iloc[:, :6].var())
Onehotc_adult_clean.head()
Age              0.295177
Weight           0.119332
Education Num    0.607897
C-Gain           0.011014
C-Loss           0.020339
Hours            0.407531
dtype: float64
Age              0.032785
Weight           0.005116
Education Num    0.028965
C-Gain           0.005635
C-Loss           0.008643
Hours            0.015013
dtype: float64
Out[16]:
Age Weight Education Num C-Gain C-Loss Hours Workclass_Federal-gov Workclass_Local-gov Workclass_Private Workclass_Self-emp-inc ... Country_Puerto-Rico Country_Scotland Country_South Country_Taiwan Country_Thailand Country_Trinadad&Tobago Country_United-States Country_Vietnam Country_Yugoslavia Income_>50K
0 0.301370 0.043350 0.800000 0.02174 0.0 0.397959 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 -1
1 0.452055 0.047274 0.800000 0.00000 0.0 0.122449 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 -1
2 0.287671 0.136877 0.533333 0.00000 0.0 0.397959 0 0 1 0 ... 0 0 0 0 0 0 1 0 0 -1
3 0.493151 0.149792 0.400000 0.00000 0.0 0.397959 0 0 1 0 ... 0 0 0 0 0 0 1 0 0 -1
4 0.150685 0.219998 0.800000 0.00000 0.0 0.397959 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 -1

5 rows Ɨ 89 columns

CPU

min-max normalization

In [17]:
from sklearn.preprocessing import MinMaxScaler
CPU_clean.iloc[:, :]=MinMaxScaler().fit_transform(CPU_clean.iloc[:, :])
display(CPU_clean.mean())
display(CPU_clean.var())
CPU_clean.head()
lread       0.010601
lwrite      0.022793
scall       0.177432
sread       0.038494
swrite      0.026254
fork        0.093666
exec        0.046877
rchar       0.077873
wchar       0.052441
pgout       0.028061
ppgout      0.032450
pgfree      0.022791
pgscan      0.017402
atch        0.005329
pgin        0.058626
ppgin       0.042338
pflt        0.122020
vflt        0.135636
runqsz      0.006602
runocc      0.035251
freemem     0.142704
freeswap    0.592071
usr         0.848170
sys         0.229458
wio         0.002747
idle        0.012899
dtype: float64
lread       0.000836
lwrite      0.002703
scall       0.017401
sread       0.001403
swrite      0.000867
fork        0.015187
exec        0.007659
rchar       0.008986
wchar       0.006114
pgout       0.004246
ppgout      0.006822
pgfree      0.003829
pgscan      0.003308
atch        0.000728
pgin        0.009656
ppgin       0.005798
pflt        0.016170
vflt        0.019585
runqsz      0.001985
runocc      0.033261
freemem     0.042984
freeswap    0.035394
usr         0.034551
sys         0.027848
wio         0.001488
idle        0.005219
dtype: float64
Out[17]:
lread lwrite scall sread swrite fork exec rchar wchar pgout ... pflt vflt runqsz runocc freemem freeswap usr sys wio idle
0 0.023848 0.000000 0.289971 0.076054 0.035419 0.168489 0.097213 0.215364 0.136493 0.036714 ... 0.686997 0.312830 0.000354 0.000001 0.010441 0.498496 0.757576 0.462963 0.0 0.0
1 0.003794 0.000000 0.270510 0.069089 0.070472 0.278330 0.026864 0.093926 0.226903 0.024558 ... 0.297844 0.305539 0.000425 0.000084 0.010274 0.612965 0.797980 0.388889 0.0 0.0
2 0.003794 0.006957 0.364826 0.240399 0.023674 0.099404 0.060275 0.201642 0.027370 0.107809 ... 0.196544 0.198168 0.000283 0.000084 0.015703 0.504059 0.797980 0.388889 0.0 0.0
3 0.003794 0.000000 0.330103 0.116905 0.090108 0.277833 0.033580 0.108598 0.077936 0.039170 ... 0.311447 0.363870 0.000283 0.000063 0.008854 0.437124 0.777778 0.425926 0.0 0.0
4 0.001626 0.001739 0.380491 0.043298 0.046063 0.099404 0.157488 0.033088 0.265742 0.000000 ... 0.082741 0.119922 0.000354 0.000084 0.075844 0.462380 0.808081 0.351852 0.0 0.0

5 rows Ɨ 26 columns

Data visualization

Census

In [18]:
Censussample=Census_Dataset.drop_duplicates(['State']).sort_values(by=['State'])
data = pd.read_excel('C:\\Users\\sunjin\\Desktop\\MUST\\MUST—Course—Python tool\\Python presentation\\python data\\test2.xlsx') 
data=data.drop_duplicates(['fips']).sort_values(by=['fips'])
data  = data .drop(data [data .fips == 3].index)
data=data.drop(data.index[[-1, -2, -3,-4 ,8]], axis=0)

statename=list(data['state'])
HUprice=list(Censussample['H23.B'])
list1 = [[statename[i],HUprice[i]] for i in range(len(statename))]
map_1 = Map(init_opts=opts.InitOpts(width="1500px", height="600px"))
map_1.add("U.S. House Prices", list1, maptype="ē¾Žå›½") 
map_1.set_global_opts( 
    visualmap_opts=opts.VisualMapOpts(min_=0,max_=250000, is_piecewise=True, split_number=15),
    legend_opts=opts.LegendOpts(is_show=True),
    )
map_1.render_notebook() 
Out[18]:
In [19]:
county_fips= pd.read_excel('C:\\Users\\sunjin\\Desktop\\fips-codes-master\\county_fips_master.xlsx')
county_fips=county_fips.iloc[:,[0,8]]
county_fips['state'] = county_fips['state'].fillna(0).astype(np.int64)
county_fips2=county_fips.drop_duplicates(['state']).sort_values(by=['state'])
county_fips2=county_fips2.drop(county_fips2 [county_fips2.state == 0].index)
county_fips2=county_fips2.drop(county_fips2 [county_fips2.state == 11].index)

# County_fips
index= []
for i in list(county_fips2['state']):
    index.append(county_fips2[county_fips2.state == i].index.tolist()[0])
#
diffs = [y - x for x, y in zip(index, index[1:])]
diffs=diffs+[23]

# Census_Dataset
Census_Dataset[np.isnan(Census_Dataset)] = 0 #delete nan
Census_Dataset[np.isinf(Census_Dataset)] = 0 #delete inf
Censussample2=Census_Dataset.iloc[:,[0,111]]

index2= []
for(i,j) in zip(list(county_fips2['state']),diffs):
    index2.append(Censussample2[Censussample2.State == i].head(j))
    
# Generate list of house price
df = pd.concat( [index2[0], index2[1], index2[2], index2[3],index2[4], index2[5], index2[6], index2[7],index2[8], index2[9], index2[10], index2[11],index2[12], index2[13], index2[14], index2[15],index2[16], index2[17], index2[18], index2[19],index2[20], index2[21], index2[22], index2[23],index2[24], index2[25], index2[26], index2[27],index2[28], index2[29],index2[30], index2[31], index2[32], index2[33],index2[34], index2[35], index2[36], index2[37],index2[38], index2[39],index2[40], index2[41], index2[42], index2[43],index2[44], index2[45], index2[46], index2[47], index2[48], index2[49]], axis=0 )

# Pre—operation for house price maps in USA
fipsname=list(county_fips['fips'])
HUprice2=list(df['H23.B'])

# House price in USA
fig = ff.create_choropleth(fips=fipsname, values=HUprice2, county_outline={'color': 'rgb(255,255,255)', 'width': 0.5})
fig.show()

Adult

In [20]:
# Show the relationship between Income, Sex and Age
plt.figure(figsize=(20, 10))
df = combine_adultclean
p1=sns.violinplot(x="Income", y="Age", hue='Sex', data=df, palette="Pastel1")
plt.ylabel("Age",fontsize=30,fontstyle="normal",fontweight='black')
plt.xlabel("Income",fontsize=30,fontstyle="normal",fontweight='black')
plt.tick_params(labelsize=20)
plt.show()
In [21]:
# Show the relationship between Income and Education Num
plt.figure(figsize=(20, 10))
sns.set(color_codes=True)
sns.set_style("white")
sns.violinplot( x=df["Income"], y=df["Education Num"] )
plt.ylabel("Education Num",fontsize=30,fontstyle="normal",fontweight='black')
plt.xlabel("Income",fontsize=30,fontstyle="normal",fontweight='black')
plt.tick_params(labelsize=20)
In [ ]:
 

CPU

In [22]:
CPU_clean
Out[22]:
lread lwrite scall sread swrite fork exec rchar wchar pgout ... pflt vflt runqsz runocc freemem freeswap usr sys wio idle
0 0.023848 0.000000 0.289971 0.076054 0.035419 0.168489 0.097213 0.215364 0.136493 0.036714 ... 0.686997 0.312830 0.000354 0.000001 0.010441 0.498496 0.757576 0.462963 0.0 0.0
1 0.003794 0.000000 0.270510 0.069089 0.070472 0.278330 0.026864 0.093926 0.226903 0.024558 ... 0.297844 0.305539 0.000425 0.000084 0.010274 0.612965 0.797980 0.388889 0.0 0.0
2 0.003794 0.006957 0.364826 0.240399 0.023674 0.099404 0.060275 0.201642 0.027370 0.107809 ... 0.196544 0.198168 0.000283 0.000084 0.015703 0.504059 0.797980 0.388889 0.0 0.0
3 0.003794 0.000000 0.330103 0.116905 0.090108 0.277833 0.033580 0.108598 0.077936 0.039170 ... 0.311447 0.363870 0.000283 0.000063 0.008854 0.437124 0.777778 0.425926 0.0 0.0
4 0.001626 0.001739 0.380491 0.043298 0.046063 0.099404 0.157488 0.033088 0.265742 0.000000 ... 0.082741 0.119922 0.000354 0.000084 0.075844 0.462380 0.808081 0.351852 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8187 0.001084 0.001739 0.062904 0.008660 0.009543 0.009940 0.003358 0.093812 0.159205 0.000000 ... 0.015526 0.016962 0.000354 0.000084 0.359171 0.810591 0.969697 0.074074 0.0 0.0
8188 0.019512 0.099130 0.027778 0.006777 0.004955 0.029821 0.010074 0.010750 0.009897 0.000000 ... 0.130696 0.030627 0.000638 0.000084 0.080521 0.765611 0.959596 0.092593 0.0 0.0
8189 0.013008 0.012174 0.466005 0.231551 0.205726 0.318091 0.100739 0.515019 0.492526 0.000000 ... 0.329851 0.375586 0.000000 0.000001 0.055630 0.471263 0.666667 0.629630 0.0 0.0
8190 0.007588 0.008696 0.289163 0.033321 0.023307 0.029821 0.010074 0.060395 0.018030 0.000000 ... 0.058013 0.089097 0.000780 0.000084 0.052790 0.681508 0.898990 0.203704 0.0 0.0
8191 0.029810 0.125217 0.153666 0.027673 0.020738 0.039761 0.040296 0.010504 0.014336 0.000000 ... 0.065348 0.041618 0.000425 0.000084 0.098313 0.434781 0.898990 0.203704 0.0 0.0

8192 rows Ɨ 26 columns

In [23]:
import statsmodels.api as sm 
CPU_cleanl=CPU_clean.drop(['usr'], axis = 1)
CPU_clean_x = sm.add_constant(CPU_cleanl.iloc[:,:])
CPU_clean_y = CPU_clean.iloc[:,[22]] 
model = sm.OLS(CPU_clean_y,CPU_clean_x)
result = model.fit() 
result.summary()
Out[23]:
OLS Regression Results
Dep. Variable: usr R-squared: 1.000
Model: OLS Adj. R-squared: 1.000
Method: Least Squares F-statistic: 2.107e+06
Date: Sat, 08 May 2021 Prob (F-statistic): 0.00
Time: 23:12:51 Log-Likelihood: 38091.
No. Observations: 8192 AIC: -7.613e+04
Df Residuals: 8166 BIC: -7.595e+04
Df Model: 25
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 1.0090 0.000 6020.739 0.000 1.009 1.009
lread -0.0008 0.001 -0.715 0.475 -0.003 0.001
lwrite -0.0003 0.001 -0.431 0.666 -0.001 0.001
scall 0.0017 0.000 4.759 0.000 0.001 0.002
sread -0.0022 0.002 -1.221 0.222 -0.006 0.001
swrite 5.166e-05 0.002 0.025 0.980 -0.004 0.004
fork 0.0002 0.001 0.273 0.785 -0.001 0.002
exec -0.0015 0.001 -2.679 0.007 -0.003 -0.000
rchar -0.0005 0.000 -1.284 0.199 -0.001 0.000
wchar -0.0009 0.000 -2.244 0.025 -0.002 -0.000
pgout 0.0012 0.001 1.341 0.180 -0.001 0.003
ppgout -0.0011 0.001 -0.915 0.360 -0.003 0.001
pgfree 0.0014 0.002 0.816 0.414 -0.002 0.005
pgscan -0.0015 0.001 -1.229 0.219 -0.004 0.001
atch -0.0055 0.001 -5.451 0.000 -0.007 -0.004
pgin 0.0011 0.001 1.471 0.141 -0.000 0.002
ppgin 0.0005 0.001 0.529 0.597 -0.001 0.002
pflt 0.0009 0.001 1.333 0.183 -0.000 0.002
vflt -0.0004 0.001 -0.484 0.629 -0.002 0.001
runqsz 0.0244 0.001 28.552 0.000 0.023 0.026
runocc -1.0314 0.000 -3740.024 0.000 -1.032 -1.031
freemem -0.0001 0.000 -0.824 0.410 -0.000 0.000
freeswap 0.0017 0.000 6.771 0.000 0.001 0.002
sys -0.5464 0.000 -1132.405 0.000 -0.547 -0.545
wio -0.0176 0.001 -26.178 0.000 -0.019 -0.016
idle -0.0253 0.000 -67.119 0.000 -0.026 -0.025
Omnibus: 2985.314 Durbin-Watson: 1.977
Prob(Omnibus): 0.000 Jarque-Bera (JB): 303886.892
Skew: -0.778 Prob(JB): 0.00
Kurtosis: 32.797 Cond. No. 125.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
 

Divide data set

In [24]:
%%dmind markdown right

#Filename+Function
## Function
### transfer_data: change data into libsvm format
### generate_data_file: change data into svmlight format
## Filename
### Census_clean+(Move'H23.B'to the last column)=Census_clean
### Census_clean+(dataset division)=Census_clean_traind / Census_clean_testd
### Census_clean_traind/Census_clean_testd+(transfer_data)=Censustrain_x_array / Censustrain_y
### Census_clean_traind+(generate_data_file)=CPU_clean_traind_svmlight.txt
Out[24]:

Census

In [25]:
# Define the function(change type of dataset into LibSVM format)
def transfer_data(train_set,test_set):
    train_x = train_set.iloc[:,:-1]
    train_x_array = np.array(train_x)
    
    train_y = np.array(train_set.iloc[:,-1])
    train_y.shape = (len(train_y))
    
    test_x = test_set.iloc[:,:-1]
    test_x_array = np.array(test_x)
    
    test_y = np.array(test_set.iloc[:,-1])
    test_y.shape = (len(test_y))
    
    return (train_x_array, train_y, test_x_array, test_y)
In [26]:
def generate_data_file(df, path):
    txt = ''
    for i in df.values:
        
        txt += str(i[-1]) + ' '
 
        for f, v in enumerate(i[:-1], start=1):
            txt += f'{f}:{v} '
        txt += '\n'
    print('The number of characters written in %s:' % path)
    print(open(path, 'w').write(txt))
# The new document position: C:\Users\sunjin\svmlight-loader
In [27]:
Census_clean.head()
Out[27]:
P1 P2 P3 P5.1 P5.2 P6.1 P6.2 P6.3 P6.4 P6.5 ... H35.3 H35.4 H35.5 H37.1 H37.2 H38 H40.1 H40.2 H40.3 H40.4
0 0.000433 0.000491 0.000430 0.420715 0.579285 0.642609 0.351403 0.001891 0.000352 0.004221 ... 0.095667 0.2296 0.000000 0.0 0.0000 0.115300 0.125000 0.500000 0.363636 0.454545
1 0.000064 0.000069 0.000064 0.416332 0.583668 0.207265 0.792735 0.000000 0.000000 0.000000 ... 0.063158 0.0000 0.000000 0.0 0.0000 0.040000 0.000000 1.000000 0.166667 0.666667
2 0.002012 0.002394 0.001745 0.454145 0.545855 0.884605 0.109761 0.002036 0.003643 0.000379 ... 0.146523 0.1704 0.263200 0.0 0.3076 0.268186 0.356164 0.095890 0.340000 0.160000
3 0.001981 0.002448 0.002070 0.422158 0.577842 0.983456 0.012546 0.002275 0.001695 0.000231 ... 0.160207 0.1888 0.236533 0.0 0.1860 0.211565 0.563291 0.107595 0.169491 0.483051
4 0.002037 0.002393 0.002037 0.413810 0.586190 0.716230 0.280485 0.001073 0.002024 0.000449 ... 0.130784 0.1896 0.166240 0.0 0.0908 0.156939 0.458599 0.159236 0.153846 0.461539

5 rows Ɨ 124 columns

In [28]:
# Move'H23.B'to the last column
last_col2 = Census_clean.pop(Census_clean.columns[100])
Census_clean=pd.concat( (Census_clean, last_col2.to_frame()),axis=1)
Census_clean.head()
Out[28]:
P1 P2 P3 P5.1 P5.2 P6.1 P6.2 P6.3 P6.4 P6.5 ... H35.4 H35.5 H37.1 H37.2 H38 H40.1 H40.2 H40.3 H40.4 H23.B
0 0.000433 0.000491 0.000430 0.420715 0.579285 0.642609 0.351403 0.001891 0.000352 0.004221 ... 0.2296 0.000000 0.0 0.0000 0.115300 0.125000 0.500000 0.363636 0.454545 0.056600
1 0.000064 0.000069 0.000064 0.416332 0.583668 0.207265 0.792735 0.000000 0.000000 0.000000 ... 0.0000 0.000000 0.0 0.0000 0.040000 0.000000 1.000000 0.166667 0.666667 0.029998
2 0.002012 0.002394 0.001745 0.454145 0.545855 0.884605 0.109761 0.002036 0.003643 0.000379 ... 0.1704 0.263200 0.0 0.3076 0.268186 0.356164 0.095890 0.340000 0.160000 0.122800
3 0.001981 0.002448 0.002070 0.422158 0.577842 0.983456 0.012546 0.002275 0.001695 0.000231 ... 0.1888 0.236533 0.0 0.1860 0.211565 0.563291 0.107595 0.169491 0.483051 0.067800
4 0.002037 0.002393 0.002037 0.413810 0.586190 0.716230 0.280485 0.001073 0.002024 0.000449 ... 0.1896 0.166240 0.0 0.0908 0.156939 0.458599 0.159236 0.153846 0.461539 0.066600

5 rows Ɨ 124 columns

In [29]:
# Divide the dataset
from sklearn.model_selection import train_test_split
#data:dataset need to divided
#random_state:define the random seed to insure generate same random numbers
#test_size:proportion of testset
Census_clean_traind, Census_clean_testd = train_test_split(Census_clean, test_size=0.2, random_state=42)

# Generate dataset for Libsvm
(Censustrain_x_array, Censustrain_y, Censustest_x_array, Censustest_y)= transfer_data(Census_clean_traind,Census_clean_testd)

# Generate dataset for Svmlight and save as ###.txt in the document——C:\Users\sunjin\svmlight-loader
generate_data_file(Census_clean_traind,"Census_clean_traind_svmlight.txt")
generate_data_file(Census_clean_testd, "Census_clean_testd_svmlight.txt")
The number of characters written in Census_clean_traind_svmlight.txt:
36984172
The number of characters written in Census_clean_testd_svmlight.txt:
9252787
In [ ]:
 

Adult

In [30]:
Onehotc_adult_clean.head()
Out[30]:
Age Weight Education Num C-Gain C-Loss Hours Workclass_Federal-gov Workclass_Local-gov Workclass_Private Workclass_Self-emp-inc ... Country_Puerto-Rico Country_Scotland Country_South Country_Taiwan Country_Thailand Country_Trinadad&Tobago Country_United-States Country_Vietnam Country_Yugoslavia Income_>50K
0 0.301370 0.043350 0.800000 0.02174 0.0 0.397959 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 -1
1 0.452055 0.047274 0.800000 0.00000 0.0 0.122449 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 -1
2 0.287671 0.136877 0.533333 0.00000 0.0 0.397959 0 0 1 0 ... 0 0 0 0 0 0 1 0 0 -1
3 0.493151 0.149792 0.400000 0.00000 0.0 0.397959 0 0 1 0 ... 0 0 0 0 0 0 1 0 0 -1
4 0.150685 0.219998 0.800000 0.00000 0.0 0.397959 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 -1

5 rows Ɨ 89 columns

In [31]:
# Divide the adult dataset
d=30162
adult_traind = Onehotc_adult_clean.iloc[:d]
adult_testd = Onehotc_adult_clean.iloc[d:]

# Generate dataset for Libsvm
(Adulttrain_x_array, Adulttrain_y, Adulttest_x_array, Adulttest_y)= transfer_data(adult_traind,adult_testd)
# The Adultdataset for Svmlight has already been created in the group project!

CPU

In [32]:
# Moveā€˜usr’to the last column
last_col = CPU_clean.pop(CPU_clean.columns[22])
CPU_clean=pd.concat( (CPU_clean, last_col.to_frame()),axis=1)
CPU_clean.head()
Out[32]:
lread lwrite scall sread swrite fork exec rchar wchar pgout ... pflt vflt runqsz runocc freemem freeswap sys wio idle usr
0 0.023848 0.000000 0.289971 0.076054 0.035419 0.168489 0.097213 0.215364 0.136493 0.036714 ... 0.686997 0.312830 0.000354 0.000001 0.010441 0.498496 0.462963 0.0 0.0 0.757576
1 0.003794 0.000000 0.270510 0.069089 0.070472 0.278330 0.026864 0.093926 0.226903 0.024558 ... 0.297844 0.305539 0.000425 0.000084 0.010274 0.612965 0.388889 0.0 0.0 0.797980
2 0.003794 0.006957 0.364826 0.240399 0.023674 0.099404 0.060275 0.201642 0.027370 0.107809 ... 0.196544 0.198168 0.000283 0.000084 0.015703 0.504059 0.388889 0.0 0.0 0.797980
3 0.003794 0.000000 0.330103 0.116905 0.090108 0.277833 0.033580 0.108598 0.077936 0.039170 ... 0.311447 0.363870 0.000283 0.000063 0.008854 0.437124 0.425926 0.0 0.0 0.777778
4 0.001626 0.001739 0.380491 0.043298 0.046063 0.099404 0.157488 0.033088 0.265742 0.000000 ... 0.082741 0.119922 0.000354 0.000084 0.075844 0.462380 0.351852 0.0 0.0 0.808081

5 rows Ɨ 26 columns

In [33]:
# Divide the dataset
from sklearn.model_selection import train_test_split
#data:dataset need to divided
#random_state:define the random seed to insure generate same random numbers
#test_size:proportion of testset
CPU_clean_traind, CPU_clean_testd = train_test_split(CPU_clean, test_size=0.2, random_state=42)

# Generate dataset for Libsvm
(CPUtrain_x_array, CPUtrain_y, CPUtest_x_array, CPUtest_y)= transfer_data(CPU_clean_traind,CPU_clean_testd)

# Generate dataset for Svmlight and save as ###.txt in the document——C:\Users\sunjin\svmlight-loader
generate_data_file(CPU_clean_traind,"CPU_clean_traind_svmlight.txt")
generate_data_file(CPU_clean_testd, "CPU_clean_testd_svmlight.txt")
The number of characters written in CPU_clean_traind_svmlight.txt:
3255809
The number of characters written in CPU_clean_testd_svmlight.txt:
815044
In [ ]:
 

Normal_Liblinear+Libsvm+svmLight

Census

Normal Liblinear

In [34]:
from liblinear.liblinearutil import train, predict
m_liblinearCensus = train(Censustrain_y, Censustrain_x_array, '-s 12 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(Censustrain_y, Censustrain_x_array, m_liblinearCensus)
Mean squared error = 0.00174747 (regression)
Squared correlation coefficient = 0.943396 (regression)
In [35]:
p_test_label, p_test_acc, p_test_val = predict(Censustest_y, Censustest_x_array, m_liblinearCensus)
Mean squared error = 0.00180537 (regression)
Squared correlation coefficient = 0.93594 (regression)
In [36]:
import time
start = time.process_time()
train(Censustrain_y, Censustrain_x_array,'-s 12 -c 4')
end = time.process_time()
print (end-start)
5.859375
In [37]:
print('The Mean squared error of Census(train set) by using Normal Liblinear is '+str(p_train_acc[1])+' (regression)')
print('The Squared correlation coefficient of Census(train set) by using Normal Liblinear is '+str(p_train_acc[2])+' (regression)')
print('The Mean squared error of Census(test set) by using Normal Liblinear is '+str(p_test_acc[1])+' (regression)')
print('The Squared correlation coefficient of Census(test set) by using Normal Liblinear is '+str(p_test_acc[2])+' (regression)')
print('The time of Census by using Normal Liblinear is '+str(end-start)+' seconds')
The Mean squared error of Census(train set) by using Normal Liblinear is 0.001747468662487266 (regression)
The Squared correlation coefficient of Census(train set) by using Normal Liblinear is 0.9433960218272639 (regression)
The Mean squared error of Census(test set) by using Normal Liblinear is 0.0018053702406047388 (regression)
The Squared correlation coefficient of Census(test set) by using Normal Liblinear is 0.9359400707973 (regression)
The time of Census by using Normal Liblinear is 5.859375 seconds

LibSVM - Gaussian Kernel

In [38]:
from libsvm.svmutil import svm_read_problem, svm_train, svm_predict
m_libsvmLRCensus_gaussian = svm_train(Censustrain_y, Censustrain_x_array, '-s 3 -t 2 -c 4')
p_train_label, p_train_acc, p_train_val = svm_predict(Censustrain_y, Censustrain_x_array, m_libsvmLRCensus_gaussian)
Mean squared error = 0.00301837 (regression)
Squared correlation coefficient = 0.953324 (regression)
In [39]:
p_test_label, p_test_acc, p_test_val = svm_predict(Censustest_y, Censustest_x_array, m_libsvmLRCensus_gaussian)
Mean squared error = 0.00311868 (regression)
Squared correlation coefficient = 0.94521 (regression)
In [40]:
start = time.process_time()
svm_train(Censustrain_y, Censustrain_x_array, '-s 3 -t 2 -c 4')
end = time.process_time()
print (end-start)
5.0625
In [41]:
print('The Mean squared error of Census(train set) by using Libsvm- Gaussian Kernel is '+str(p_train_acc[1])+' (regression)')
print('The Squared correlation coefficient of Census(train set) by using Libsvm- Gaussian Kernel is '+str(p_train_acc[2])+' (regression)')
print('The Mean squared error of Census(test set) by using Libsvm- Gaussian Kernel is '+str(p_test_acc[1])+' (regression)')
print('The Squared correlation coefficient of Census(test set) by using Libsvm- Gaussian Kernel is '+str(p_test_acc[2])+' (regression)')
print('The time of Census by using Libsvm- Gaussian Kernel is '+str(end-start)+' seconds')
The Mean squared error of Census(train set) by using Libsvm- Gaussian Kernel is 0.003018371579739891 (regression)
The Squared correlation coefficient of Census(train set) by using Libsvm- Gaussian Kernel is 0.953323587844041 (regression)
The Mean squared error of Census(test set) by using Libsvm- Gaussian Kernel is 0.003118678952781748 (regression)
The Squared correlation coefficient of Census(test set) by using Libsvm- Gaussian Kernel is 0.9452104488172001 (regression)
The time of Census by using Libsvm- Gaussian Kernel is 5.0625 seconds

Svmlight- Gaussian Kernel

In [42]:
# Powershell code:
## ./svm_learn.exe -z r -t 2 -c 4 Census/Census_clean_traind_svmlight.txt Census/model
## ./svm_classify.exe Census/Census_clean_traind_svmlight.txt Census/model Census/predicidons
In [43]:
# Read the prediction txt file save as Census_clean_traind_predict
with open("C:\\Users\\sunjin\\Desktop\\machine learning group project\\svm_light\\Census\\Census_clean_traind_predict.txt", "r") as f:
    Census_clean_traind_predict = f.readlines()

# Delete the ā€œ\nā€ in the list and change str into float format
Census_clean_traind_predictnew=[]
for i in range(len(Census_clean_traind_predict)):
    Census_clean_traind_predictnew.append(float(Census_clean_traind_predict[i].strip('\n')))

# Compute MSE by using svmlight
mc=[Census_clean_traind_predictnew[i]-list(Census_clean_traind['H23.B'])[i] for i in range(0,len(Census_clean_traind_predictnew))]
for i in range(len(mc)):
    mc[i] =mc[i]**2
CensussvmlightMSE=sum(mc)/len(Census_clean_traind_predictnew)

## Compute R_square by using svmlight(train set)
ssrc=[list(Census_clean_traind['H23.B'])[i]-Census_clean_traind_predictnew[i] for i in range(len(Census_clean_traind_predictnew))]
for i in range(len(ssrc)):
    ssrc[i] =ssrc[i]**2
CensussvmlighttraindSSR=sum(ssrc)

sstc=[list(Census_clean_traind['H23.B'])[i]-np.mean(list(Census_clean_traind['H23.B'])) for i in range(len(list(Census_clean_traind['H23.B'])))]
for i in range(len(sstc)):
    sstc[i] =sstc[i]**2
CensussvmlighttraindSST=sum(sstc)

CensussvmlighttraindR_square=1-(CensussvmlighttraindSSR/CensussvmlighttraindSST)
In [44]:
# Powershell code:
## ./svm_classify.exe Census/Census_clean_testd_svmlight.txt Census/model Census/predicidons2
In [45]:
del mc
del ssrc
del sstc
# Read the prediction txt file save as Census_clean_testd_predict
with open("C:\\Users\\sunjin\\Desktop\\machine learning group project\\svm_light\\Census\\Census_clean_testd_predict.txt", "r") as f:
    Census_clean_testd_predict = f.readlines()

# Delete the ā€œ\nā€ in the list and change str into float format
Census_clean_testd_predictnew=[]
for i in range(len(Census_clean_testd_predict)):
    Census_clean_testd_predictnew.append(float(Census_clean_testd_predict[i].strip('\n')))

# Compute MSE by using svmlight(testd)
mc=[Census_clean_testd_predictnew[i]-list(Census_clean_testd['H23.B'])[i] for i in range(0,len(Census_clean_testd_predictnew))]
for i in range(len(mc)):
    mc[i] =mc[i]**2
CensussvmlighttestdMSE=sum(mc)/len(Census_clean_testd_predictnew)

# Compute R_square by using svmlight(testd)
ssrc=[list(Census_clean_testd['H23.B'])[i]-Census_clean_testd_predictnew[i] for i in range(len(Census_clean_testd_predictnew))]
for i in range(len(ssrc)):
    ssrc[i] =ssrc[i]**2
CensussvmlighttestdSSR=sum(ssrc)

sstc=[list(Census_clean_testd['H23.B'])[i]-np.mean(list(Census_clean_testd['H23.B'])) for i in range(len(list(Census_clean_testd['H23.B'])))]
for i in range(len(sstc)):
    sstc[i] =sstc[i]**2
CensussvmlighttestdSST=sum(sstc)

CensussvmlighttestdR_square=1-(CensussvmlighttestdSSR/CensussvmlighttestdSST)
In [46]:
# Running time

image.png

In [47]:
print('The Mean squared error of Census(train set) by using Svmlight- Gaussian Kernel is '+str(CensussvmlightMSE)+' (regression)')
print('The Squared correlation coefficient of Census(train set) by using Svmlight- Gaussian Kernel is '+str(CensussvmlighttraindR_square)+' (regression)')
print('The Mean squared error of Census(test set) by using Svmlight- Gaussian Kernel is '+str(CensussvmlighttestdMSE)+' (regression)')
print('The Squared correlation coefficient of Census(test set) by using Svmlight- Gaussian Kernel is '+str(CensussvmlighttestdR_square)+' (regression)')
print('The time of Census by using Svmlight- Gaussian Kernel is '+str(29.16)+' seconds')
The Mean squared error of Census(train set) by using Svmlight- Gaussian Kernel is 0.0022167978444492657 (regression)
The Squared correlation coefficient of Census(train set) by using Svmlight- Gaussian Kernel is 0.805010753608935 (regression)
The Mean squared error of Census(test set) by using Svmlight- Gaussian Kernel is 0.0029487871683357035 (regression)
The Squared correlation coefficient of Census(test set) by using Svmlight- Gaussian Kernel is 0.71562768630384 (regression)
The time of Census by using Svmlight- Gaussian Kernel is 29.16 seconds
In [ ]:
 

Adult

Normal Liblinear

In [48]:
from liblinear.liblinearutil import train, predict
m_liblinearAdult = train(Adulttrain_y, Adulttrain_x_array, '-s 0 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(Adulttrain_y, Adulttrain_x_array, m_liblinearAdult)
Accuracy = 84.9446% (25621/30162) (classification)
In [49]:
p_test_label, p_test_acc, p_test_val = predict(Adulttest_y, Adulttest_x_array, m_liblinearAdult)
Accuracy = 84.8606% (12780/15060) (classification)
In [50]:
import time
start = time.process_time()
train(Adulttrain_y, Adulttrain_x_array, '-s 0 -c 4')
end = time.process_time()
print (end-start)
1.03125
In [51]:
print('The Accuracy of Adult(train set) by using Normal Liblinear is '+str(p_train_acc[0])+' (classification)')
print('The Accuracy of Adult(test set) by using Normal Liblinear is '+str(p_test_acc[0])+' (classification)')
print('The time of Adult by using Normal Liblinear is '+str(end-start)+' seconds')
The Accuracy of Adult(train set) by using Normal Liblinear is 84.94463231881176 (classification)
The Accuracy of Adult(test set) by using Normal Liblinear is 84.86055776892431 (classification)
The time of Adult by using Normal Liblinear is 1.03125 seconds

LibSVM - Gaussian Kernel

In [52]:
# LibSVM - Gaussian Kernel
from libsvm.svmutil import svm_read_problem, svm_train, svm_predict
m_libsvmAdult_gaussian = svm_train(Adulttrain_y, Adulttrain_x_array, '-s 0 -t 2 -c 4')
p_train_label, p_train_acc, p_train_val = svm_predict(Adulttrain_y, Adulttrain_x_array, m_libsvmAdult_gaussian)
Accuracy = 83.6682% (25236/30162) (classification)
In [53]:
p_test_label, p_test_acc, p_test_val = svm_predict(Adulttest_y, Adulttest_x_array, m_libsvmAdult_gaussian)
Accuracy = 83.5923% (12589/15060) (classification)
In [54]:
import time
start = time.process_time()
svm_train(Adulttrain_y, Adulttrain_x_array, '-s 0 -t 2 -c 4')
end = time.process_time()
print (end-start)
44.1875
In [55]:
print('The Accuracy of Adult(train set) by using LibSVM - Gaussian Kernel is '+str(p_train_acc[0])+' (classification)')
print('The Accuracy of Adult(test set) by using LibSVM - Gaussian Kernel is '+str(p_test_acc[0])+' (classification)')
print('The time of Adult by using LibSVM - Gaussian Kernel is '+str(end-start)+' seconds')
The Accuracy of Adult(train set) by using LibSVM - Gaussian Kernel is 83.66819176447186 (classification)
The Accuracy of Adult(test set) by using LibSVM - Gaussian Kernel is 83.59229747675963 (classification)
The time of Adult by using LibSVM - Gaussian Kernel is 44.1875 seconds

Svmlight- Gaussian Kernel

In [56]:
# powershell
##  ./svm_learn.exe -t 2 adultdata/adulttrain_data adultdata/model
## ./svm_classify.exe adultdata/adulttrain_data adultdata/model adultdata/predicidons
## ./svm_classify.exe adultdata/adulttest_data adultdata/model adultdata/predicidons2

Model Establish

image.png

image.png

Adult_train_dataset

image.png

Adult_test_dataset

image.png

CPU

Normal liblinear

In [57]:
from liblinear.liblinearutil import train, predict
m_liblinearCPU = train(CPUtrain_y, CPUtrain_x_array, '-s 12 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(CPUtrain_y, CPUtrain_x_array, m_liblinearCPU)
Mean squared error = 0.0252611 (regression)
Squared correlation coefficient = 0.560471 (regression)
In [58]:
p_test_label, p_test_acc, p_test_val = predict(CPUtest_y, CPUtest_x_array, m_liblinearCPU)
Mean squared error = 0.0250452 (regression)
Squared correlation coefficient = 0.59684 (regression)
In [59]:
import time
start = time.process_time()
train(CPUtrain_y, CPUtrain_x_array,'-s 12 -c 4')
end = time.process_time()
print (end-start)
0.453125
In [60]:
print('The Mean squared error of CPU(train set) by using Normal liblinear is '+str(p_train_acc[1])+' (regression)')
print('The Squared correlation coefficient of CPU(train set) by using Normal liblinear is '+str(p_train_acc[2])+' (regression)')
print('The Mean squared error of CPU(test set) by using Normal liblinear is '+str(p_test_acc[1])+' (regression)')
print('The Squared correlation coefficient of CPU(test set) by using Normal liblinear is '+str(p_test_acc[2])+' (regression)')
print('The time of CPU by using Normal liblinear is '+str(end-start)+' seconds')
The Mean squared error of CPU(train set) by using Normal liblinear is 0.025261127373393342 (regression)
The Squared correlation coefficient of CPU(train set) by using Normal liblinear is 0.5604712023805933 (regression)
The Mean squared error of CPU(test set) by using Normal liblinear is 0.025045205853192937 (regression)
The Squared correlation coefficient of CPU(test set) by using Normal liblinear is 0.5968401519963905 (regression)
The time of CPU by using Normal liblinear is 0.453125 seconds

LibSVM - Gaussian Kernel

In [61]:
# CPU
In [62]:
from libsvm.svmutil import svm_read_problem, svm_train, svm_predict
m_libsvmLRCPU_gaussian = svm_train(CPUtrain_y, CPUtrain_x_array, '-s 3 -t 2 -c 4')
p_train_label, p_train_acc, p_train_val = svm_predict(CPUtrain_y, CPUtrain_x_array, m_libsvmLRCPU_gaussian)
Mean squared error = 0.00192993 (regression)
Squared correlation coefficient = 0.963821 (regression)
In [63]:
p_test_label, p_test_acc, p_test_val = svm_predict(CPUtest_y, CPUtest_x_array, m_libsvmLRCPU_gaussian)
Mean squared error = 0.00192227 (regression)
Squared correlation coefficient = 0.96754 (regression)
In [64]:
start = time.process_time()
svm_train(CPUtrain_y, CPUtrain_x_array, '-s 3 -t 2 -c 4')
end = time.process_time()
print (end-start)
0.265625
In [65]:
print('The Mean squared error of CPU(train set) by using LibSVM - Gaussian Kernel is '+str(p_train_acc[1])+' (regression)')
print('The Squared correlation coefficient of CPU(train set) by using LibSVM - Gaussian Kernel is '+str(p_train_acc[2])+' (regression)')
print('The Mean squared error of CPU(test set) by usingLibSVM - Gaussian Kernel is '+str(p_test_acc[1])+' (regression)')
print('The Squared correlation coefficient of CPU(test set) by using LibSVM - Gaussian Kernel is '+str(p_test_acc[2])+' (regression)')
print('The time of CPU by using LibSVM - Gaussian Kernel is '+str(end-start)+' seconds')
The Mean squared error of CPU(train set) by using LibSVM - Gaussian Kernel is 0.0019299252462306807 (regression)
The Squared correlation coefficient of CPU(train set) by using LibSVM - Gaussian Kernel is 0.963820845979315 (regression)
The Mean squared error of CPU(test set) by usingLibSVM - Gaussian Kernel is 0.0019222732930495806 (regression)
The Squared correlation coefficient of CPU(test set) by using LibSVM - Gaussian Kernel is 0.9675399119685328 (regression)
The time of CPU by using LibSVM - Gaussian Kernel is 0.265625 seconds

Svmlight- Gaussian Kernel

In [66]:
# powershell
## ./svm_learn.exe -z r -t 2 -c 4 CPUdata/CPU_clean_traind_svmlight.txt CPUdata/model
## ./svm_classify.exe CPUdata/CPU_clean_traind_svmlight.txt CPUdata/model CPUdata/predicidons
## ./svm_classify.exe CPUdata/CPU_clean_testd_svmlight.txt CPUdata/model CPUdata/predicidons2
In [67]:
del mc
del ssrc
del sstc
# Read the prediction txt file save as CPU_clean_traind_predict
with open("C:\\Users\\sunjin\\Desktop\\machine learning group project\\svm_light\\CPUdata\\CPU_clean_train_predict.txt", "r") as f:
    CPU_clean_traind_predict = f.readlines()
    
# Delete the ā€œ\nā€ in the list and change str into float format
CPU_clean_traind_predictnew=[]
for i in range(len(CPU_clean_traind_predict)):
    CPU_clean_traind_predictnew.append(float(CPU_clean_traind_predict[i].strip('\n')))

# Compute MSE by using svmlight
mc=[CPU_clean_traind_predictnew[i]-list(CPU_clean_traind['usr'])[i] for i in range(0,len(CPU_clean_traind_predictnew))]
for i in range(len(mc)):
    mc[i] =mc[i]**2
CPUsvmlighttraindMSE=sum(mc)/len(CPU_clean_traind_predictnew)

# Compute R_square by using svmlight(train set)
ssrc=[CPU_clean_traind_predictnew[i]-np.mean(list(CPU_clean_traind['usr'])) for i in range(0,len(CPU_clean_traind_predictnew))]
for i in range(len(ssrc)):
    ssrc[i] =ssrc[i]**2
CPUsvmlighttraindSSR=sum(ssrc)

sstc=[list(CPU_clean_traind['usr'])[i]-np.mean(list(CPU_clean_traind['usr'])) for i in range(0,len(list(CPU_clean_traind['usr'])))]
for i in range(len(sstc)):
    sstc[i] =sstc[i]**2
CPUsvmlighttraindSST=sum(sstc)

CPUsvmlighttraindR_square=CPUsvmlighttraindSSR/CPUsvmlighttraindSST
In [68]:
del mc
del ssrc
del sstc
# Read the prediction txt file save as CPU_clean_testd_predict
with open("C:\\Users\\sunjin\\Desktop\\machine learning group project\\svm_light\\CPUdata\\CPU_clean_testd_predict.txt", "r") as f:
    CPU_clean_testd_predict = f.readlines()
    
# Delete the ā€œ\nā€ in the list and change str into float format
CPU_clean_testd_predictnew=[]
for i in range(len(CPU_clean_testd_predict)):
    CPU_clean_testd_predictnew.append(float(CPU_clean_testd_predict[i].strip('\n')))
    
# Compute MSE by using svmlight
mc=[CPU_clean_testd_predictnew[i]-list(CPU_clean_testd['usr'])[i] for i in range(0,len(CPU_clean_testd_predictnew))]
for i in range(len(mc)):
    mc[i] =mc[i]**2
CPUsvmlighttestdMSE=sum(mc)/len(CPU_clean_testd_predictnew)

# Compute R_square by using svmlight(test set)
ssrc=[CPU_clean_testd_predictnew[i]-np.mean(list(CPU_clean_testd['usr'])) for i in range(0,len(CPU_clean_testd_predictnew))]
for i in range(len(ssrc)):
    ssrc[i] =ssrc[i]**2
CPUsvmlighttestdSSR=sum(ssrc)

sstc=[list(CPU_clean_testd['usr'])[i]-np.mean(list(CPU_clean_testd['usr'])) for i in range(0,len(list(CPU_clean_testd['usr'])))]
for i in range(len(sstc)):
    sstc[i] =sstc[i]**2
CPUsvmlighttestdSST=sum(sstc)

CPUsvmlighttestdR_square=CPUsvmlighttestdSSR/CPUsvmlighttestdSST
In [69]:
# Running time

image.png

In [70]:
print('The Mean squared error of CPU(train set) by using Svmlight- Gaussian Kernel is '+str(CPUsvmlighttraindMSE)+' (regression)')
print('The Squared correlation coefficient of CPU(train set) by using Svmlight- Gaussian Kernel is '+str(CPUsvmlighttraindR_square)+' (regression)')
print('The Mean squared error of CPU(test set) by using Svmlight- Gaussian Kernel is '+str(CPUsvmlighttestdMSE)+' (regression)')
print('The Squared correlation coefficient of CPU(test set) by using Svmlight- Gaussian Kernel is '+str(CPUsvmlighttestdR_square)+' (regression)')
print('The time of CPU by using Svmlight- Gaussian Kernel is '+str(0.24)+' seconds')
The Mean squared error of CPU(train set) by using Svmlight- Gaussian Kernel is 0.0013529777856506335 (regression)
The Squared correlation coefficient of CPU(train set) by using Svmlight- Gaussian Kernel is 0.8924923244784451 (regression)
The Mean squared error of CPU(test set) by using Svmlight- Gaussian Kernel is 0.0013903350377532683 (regression)
The Squared correlation coefficient of CPU(test set) by using Svmlight- Gaussian Kernel is 0.8890872239562991 (regression)
The time of CPU by using Svmlight- Gaussian Kernel is 0.24 seconds
In [ ]:
 
In [ ]:
 

Liblinear+RFF

In [71]:
# Define the RFF function
data = '../data/'
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from scipy.stats import cauchy, laplace
from sklearn.metrics.pairwise import rbf_kernel, laplacian_kernel

class RFF(BaseEstimator):
    def __init__(self, gamma = 0.01, D = 500, metric = "rbf"):
        self.gamma = gamma
        self.metric = metric
        self.D = D
        self.fitted = False
        
    def fit(self, X):
        """ Generates MonteCarlo random samples """
        d = X.shape[1]
        if self.metric == "rbf":
            self.w = np.sqrt(2*self.gamma)*np.random.normal(size=(self.D,d))
        
        self.u = 2*np.pi*np.random.rand(self.D)
        
        self.fitted = True

        return self
    
    def transform(self,X):
        """ Transforms the data X (n_samples, n_features) to the new map space Z(X) (n_samples, n_components)"""
        if not self.fitted:
            raise NotFittedError("RBF_MonteCarlo must be fitted beform computing the feature map Z")
        Z = np.sqrt(2/self.D)*np.cos((X.dot(self.w.T) + self.u[np.newaxis,:]))
        print(Z.shape)
        return Z

Census

In [72]:
# Census
# Delete the Label and generate features dataset 
Census_clean_traind_x=Census_clean_traind.iloc[:,0:-1]
Census_clean_testd_x=Census_clean_testd.iloc[:,0:-1]

# Generate rff feature dataset
rff = RFF()
rff.fit(Census_clean_traind_x)
Census_clean_traind_x_rff = rff.transform(Census_clean_traind_x)
Census_clean_testd_x_rff = rff.transform(Census_clean_testd_x)

# Change the rff feature dataset into array format
Census_clean_traind_x_rff2 = Census_clean_traind_x_rff.copy()
Census_clean_traind_x_rff_array = np.array(Census_clean_traind_x_rff2)
Census_clean_testd_x_rff2 = Census_clean_testd_x_rff.copy()
Census_clean_testd_x_rff_array = np.array(Census_clean_testd_x_rff2)
(18227, 500)
(4557, 500)
In [73]:
from liblinear.liblinearutil import train, predict
m_liblinearCensus = train(Censustrain_y, Census_clean_traind_x_rff_array, '-s 12 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(Censustrain_y, Census_clean_traind_x_rff_array, m_liblinearCensus)
Mean squared error = 0.000830329 (regression)
Squared correlation coefficient = 0.931541 (regression)
In [74]:
p_test_label, p_test_acc, p_test_val = predict(Censustest_y, Census_clean_testd_x_rff_array, m_liblinearCensus)
Mean squared error = 0.000850214 (regression)
Squared correlation coefficient = 0.926012 (regression)
In [75]:
import time
start = time.process_time()
train(Censustrain_y, Census_clean_traind_x_rff_array,'-s 12 -c 4')
end = time.process_time()
print (end-start)
11.796875

Adult

In [76]:
# Adult
# Delete the Label and generate features dataset 
Adult_clean_traind_x=adult_traind.iloc[:,0:-1]
Adult_clean_testd_x=adult_testd.iloc[:,0:-1]

# Generate rff feature dataset
rff = RFF()
rff.fit(Adult_clean_traind_x)
Adult_clean_traind_x_rff = rff.transform(Adult_clean_traind_x)
Adult_clean_testd_x_rff = rff.transform(Adult_clean_testd_x)

# Change the rff feature dataset into array format
Adult_clean_traind_x_rff2 = Adult_clean_traind_x_rff.copy()
Adult_clean_traind_x_rff_array = np.array(Adult_clean_traind_x_rff2)
Adult_clean_testd_x_rff2 = Adult_clean_testd_x_rff.copy()
Adult_clean_testd_x_rff_array = np.array(Adult_clean_testd_x_rff2)
(30162, 500)
(15060, 500)
In [77]:
from liblinear.liblinearutil import train, predict
m_liblinearAdult = train(Adulttrain_y, Adult_clean_traind_x_rff_array, '-s 0 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(Adulttrain_y, Adult_clean_traind_x_rff_array, m_liblinearAdult)
Accuracy = 83.824% (25283/30162) (classification)
In [78]:
p_test_label, p_test_acc, p_test_val = predict(Adulttest_y,Adult_clean_testd_x_rff_array, m_liblinearAdult)
Accuracy = 83.7849% (12618/15060) (classification)
In [79]:
import time
start = time.process_time()
train(Adulttrain_y, Adult_clean_traind_x_rff_array, '-s 0 -c 4')
end = time.process_time()
print (end-start)
21.34375

CPU

In [80]:
# CPU
# Delete the Label and generate features dataset 
CPU_clean_traind_x=CPU_clean_traind.iloc[:,0:-1]
CPU_clean_testd_x=CPU_clean_testd.iloc[:,0:-1]

# Generate rff feature dataset
rff = RFF()
rff.fit(CPU_clean_traind_x)
CPU_clean_traind_x_rff = rff.transform(CPU_clean_traind_x)
CPU_clean_testd_x_rff = rff.transform(CPU_clean_testd_x)

# Change the rff feature dataset into array format
CPU_clean_traind_x_rff2 = CPU_clean_traind_x_rff.copy()
CPU_clean_traind_x_rff_array = np.array(CPU_clean_traind_x_rff2)
CPU_clean_testd_x_rff2 = CPU_clean_testd_x_rff.copy()
CPU_clean_testd_x_rff_array = np.array(CPU_clean_testd_x_rff2)
(6553, 500)
(1639, 500)
In [81]:
from liblinear.liblinearutil import train, predict
m_liblinearCPU = train(CPUtrain_y, CPU_clean_traind_x_rff_array, '-s 12 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(CPUtrain_y, CPU_clean_traind_x_rff_array, m_liblinearCPU)
Mean squared error = 0.00386527 (regression)
Squared correlation coefficient = 0.928472 (regression)
In [82]:
p_test_label, p_test_acc, p_test_val = predict(CPUtest_y, CPU_clean_testd_x_rff_array, m_liblinearCPU)
Mean squared error = 0.00385966 (regression)
Squared correlation coefficient = 0.935474 (regression)
In [83]:
import time
start = time.process_time()
train(CPUtrain_y, CPU_clean_traind_x_rff_array,'-s 12 -c 4')
end = time.process_time()
print (end-start)
4.21875
In [84]:
end0 = time.process_time()
print (end0-start0)
738.40625

Z-score Normalization

Census_clean=Census_Dataset.drop(['State','Code','H23.A','H23.C','H24','P4.1','P4.2','P4.3','P4.4','H4.1','H4.2','H4.3','H4.4','H35.1'], axis = 1) Census_cleanZ_score = (Census_clean - Census_clean.mean()) / (Census_clean.std())

Onehotc_adult_clean=pd.get_dummies(combineadultclean,dtype='int8').drop(['Income<=50K'], axis = 1) Onehotc_adultclean['Income>50K']=Onehotc_adultclean['Income>50K'].replace(0, -1) Onehotc_adult_cleanZ_score = (Onehotc_adult_clean- Onehotc_adult_clean.mean()) / (Onehotc_adult_clean.std())

CPU_clean=CPU.drop(['time'], axis = 1) CPU_cleanZ_score = (CPU_clean - CPU_clean.mean()) / (CPU_clean.std())

In [ ]: